Citation¶

Much of the code and examples are copied/modified from

Blueprints for Text Analytics Using Python by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), 978-1-492-07408-3.

  • https://github.com/blueprints-for-text-analytics-python/blueprints-text
  • https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch08/Topic_Modeling_Clustering.ipynb

Configuration¶

In [1]:
# (min_n, max_n) n-gram range intended for CountVectorizer/TfidfVectorizer and, therefore,
# for the n-grams the topic models will use; it is also embedded in the names of the
# pyLDAvis HTML files saved further down.
n_gram_range = (1, 3)
# Stop words specific to this dataset; they are added to spaCy's English stop words
# before the text is vectorized.
custom_stop_words = {'united', 'nations', 'nation'}
# Number of topics (n_components) the NMF/LDA models are meant to create.
number_of_topics = 10

Setup¶

In [2]:
cd ../..
/Users/shanekercheval/repos/nlp-template
In [3]:
# Execute the shared notebook settings script in this kernel's namespace.
# NOTE(review): `pd`, `px` and `hlp` are used below without being imported in this notebook,
# so they are presumably defined by this script — confirm.
%run "source/config/notebook_settings.py"
In [4]:
# Never truncate cell contents when rendering DataFrames, so full paragraphs stay readable.
pd.options.display.max_colwidth = None
In [5]:
from source.library.utilities import Timer, get_logger
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, count_keywords_by, impurity
from source.library.sklearn_topic_modeling import *
In [6]:
# Load the pre-processed paragraphs DataFrame; the path is relative to the working
# directory set by the `cd` cell above.
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
with Timer("Loading Data"):
    path = 'artifacts/data/processed/un-general-debates-paragraphs.pkl'
    paragraphs = pd.read_pickle(path)
Started: Loading Data
Finished (0.12 seconds)

Exploratory Data Analysis¶

This section provides a basic exploration of the text and dataset.

Dataset Summary¶

In [7]:
# Summary statistics for the numeric columns of `paragraphs`.
hlp.pandas.numeric_summary(paragraphs)
Out[7]:
  # of Non-Nulls # of Nulls % Nulls # of Zeros % Zeros Mean St Dev. Coef of Var Skewness Kurtosis Min 10% 25% 50% 75% 90% Max
year 279,045 0 0.0% 0 0.0% 1,992.4 12.6 0.0 0.1 -1.1 1,970 1,975.0 1,982.0 1,993.0 2,003.0 2,010.0 2,015
In [8]:
# Summary (null counts, most frequent value, uniqueness) for the non-numeric columns.
hlp.pandas.non_numeric_summary(paragraphs)
Out[8]:
  # of Non-Nulls # of Nulls % Nulls Most Freq. Value # of Unique % Unique
country 279,045 0 0.0% Russian Federation 199 0.1%
text 279,045 0 0.0% The President returned to the [...] 278,820 99.9%
In [9]:
# Sanity check: once surrounding whitespace is stripped, no paragraph is an empty string.
assert paragraphs['text'].str.strip().ne('').all()

Prep¶

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
In [11]:
# `stopwords` is spaCy's module-level STOP_WORDS set. Using `|=` would modify that shared
# set in place for everything else in this kernel that relies on spaCy, so a separate copy
# is built instead (re-running this cell gives the same result).
# 'll' and 've' are presumably the fragments the vectorizer's tokenizer leaves behind for
# contractions such as "we'll" / "we've" — confirm.
stopwords = set(stopwords) | custom_stop_words | {'ll', 've'}

Sample¶

In [12]:
# Work on a random subset to keep vectorizing and model fitting fast.
# The sample is seeded (same seed as the NMF/LDA models below) so that Restart & Run All
# reproduces the same documents and topics, and the size is capped at the number of
# available rows so a smaller dataset does not raise.
paragraphs = paragraphs.sample(n=min(2000, len(paragraphs)), random_state=42)
#paragraphs.to_pickle('source/tests/test_files/datasets/un_debates_paragraphs_sample.pkl')

TF / TF-IDF¶

NOTE: LDA appears to be used with raw term frequencies (TF) rather than TF-IDF, so both matrices are computed below.

In [13]:
# Build both document-term matrices from the sampled paragraphs:
#   * raw term counts (TF) -> used by LDA
#   * TF-IDF weights       -> used by NMF
# `n_gram_range` comes from the configuration cell so the vocabulary and the exported
# file names cannot drift apart.
with Timer(f"Calculating TF & TF-IDF ({n_gram_range[0]}-{n_gram_range[1]} ngrams)"):
    # scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list or None,
    # so the stop-word set is passed as a list.
    count_vectorizer = CountVectorizer(stop_words=sorted(stopwords), ngram_range=n_gram_range, min_df=5, max_df=0.7)
    count_vectors = count_vectorizer.fit_transform(paragraphs["text"])
    print(count_vectors.shape)

    tfidf_vectorizer = TfidfVectorizer(stop_words=sorted(stopwords), ngram_range=n_gram_range, min_df=5, max_df=0.7)
    tfidf_vectors = tfidf_vectorizer.fit_transform(paragraphs["text"])
    # A bare expression inside the `with` block is not displayed, so print the shape explicitly.
    print(tfidf_vectors.shape)
Started: Calculating TF & TF-IDF (1-3 ngrams)
(2000, 3531)
Finished (0.29 seconds)

Topic Modeling¶

In [14]:
import matplotlib.pyplot as plt
def plot_top_words(model, feature_names, n_top_words, title):
    """
    Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    The subplot grid is sized from the number of rows in `model.components_` (at most five
    charts per row), so the function works for any number of topics rather than exactly 10.

    Parameters
    ----------
    model : fitted estimator exposing `components_` (one row of term weights per topic)
    feature_names : sequence mapping a term's column index to its text
    n_top_words : number of terms to show per topic
    title : title placed above the whole figure
    """
    n_topics = len(model.components_)
    n_cols = max(1, min(5, n_topics))
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division
    # 6 x 7.5 inches per chart reproduces the original 30 x 15 figure for 10 topics.
    # squeeze=False always returns a 2-D array of axes, even for a single chart.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(6 * n_cols, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # largest weight at the top
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # hide grid cells that have no topic (when n_topics is not a multiple of n_cols)
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
In [ ]:
 
In [15]:
def display_topics(model, features, no_top_words=5):
    """
    Print the highest-weighted terms of every topic in a fitted topic model.

    For each row of `model.components_` a "Topic NN" header is printed, followed by up to
    `no_top_words` terms, each with its weight expressed as a percentage of the topic's
    total weight.

    Parameters
    ----------
    model : fitted estimator exposing `components_` (one row of term weights per topic)
    features : sequence mapping a term's column index to its text
    no_top_words : maximum number of terms to print per topic; if it exceeds the number of
        terms, all terms are printed instead of raising IndexError
    """
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        # slicing clamps to the available terms; max() keeps negative values printing nothing
        for term_index in largest[:max(no_top_words, 0)]:
            print("  %s (%2.2f)" % (features[term_index], abs(words[term_index]*100.0/total)))

NMF¶

In [16]:
from sklearn.decomposition import NMF

# Fit NMF on the TF-IDF matrix with `number_of_topics` components; random_state is fixed
# so the factorization is repeatable for the same input.
nmf_model = NMF(init='nndsvda', n_components=number_of_topics, random_state=42, max_iter=1000)
# Only the fitted model is needed here; the document-topic matrix is recomputed later
# with `transform`.
_ = nmf_model.fit_transform(tfidf_vectors)
# Vocabulary terms in the column order of `tfidf_vectors` / `nmf_model.components_`.
nmf_feature_names = tfidf_vectorizer.get_feature_names_out()
In [17]:
# Plot the NMF topics with the project helper from `sklearn_topic_modeling`.
# Presumably shows the 8 top tokens per topic and labels each topic with its 2 top
# tokens — confirm against the helper.
plot_topics(
    model=nmf_model,
    features=nmf_feature_names,
    top_n_tokens=8,
    num_tokens_in_label=2
)
In [18]:
# Plot the relative size of each NMF topic over the sampled paragraphs (project helper).
plot_topic_sizes(
    model=nmf_model,
    dataset=tfidf_vectors,
    features=nmf_feature_names,
)
In [19]:
# Score every sampled paragraph against the fitted NMF topics and sum the topic weights
# per document; the box plot shows how those per-document totals are distributed.
predicted_topics = nmf_model.transform(tfidf_vectors)
per_document_totals = predicted_topics.sum(axis=1)
ax = pd.Series(per_document_totals).plot.box(vert=False, figsize=(10, 1))
ax.set(
    title="Distribution Sum of Predicted Values/Topics Per Document",
    xlabel="Sum of Predicted Values Per Document",
    yticklabels=[],
);
In [20]:
def get_topic_sizes_per_year(model, features, vectorizer, data=None):
    """
    Compute the size of every topic for each year, in long format.

    For every distinct value in `data['year']` the texts of that year are vectorized with
    `vectorizer` and passed to `calculate_topic_sizes`; topics are labelled with their two
    top tokens joined by ' | ' (via `create_topic_labels`).

    Parameters
    ----------
    model : fitted topic model (e.g. NMF or LDA)
    features : vocabulary terms matching the model's components
    vectorizer : the fitted vectorizer that produced the model's training matrix
    data : DataFrame with 'year' and 'text' columns; defaults to the notebook-level
        `paragraphs` DataFrame when omitted

    Returns
    -------
    DataFrame with the columns 'topic_labels', 'year' and 'value', one row per
    (topic, year) pair.
    """
    if data is None:
        data = paragraphs  # notebook-level DataFrame

    topic_labels = create_topic_labels(
        model=model,
        features=features,
        token_separator=' | ',
        top_n_tokens=2,
    )
    # Labels become dict keys below, so topics that share a label would overwrite each other.
    labels = list(topic_labels.values())

    def get_segment_sizes(texts):
        # vectorize one year's texts with the already-fitted vectorizer, then size the topics
        new_data = vectorizer.transform(texts)
        return calculate_topic_sizes(model=model, dataset=new_data)

    yearly_dict = {}
    for year in sorted(data['year'].unique()):
        sizes = get_segment_sizes(data.loc[data['year'] == year, 'text'])
        yearly_dict[year] = dict(zip(labels, sizes))

    # wide (topics x years) -> long (topic_labels, year, value)
    df = pd.DataFrame(yearly_dict).reset_index().rename(columns={'index': 'topic_labels'})
    # every column except the id column is a year, so value_vars can be left to its default
    return pd.melt(df, id_vars='topic_labels', var_name='year')

# Per-year topic sizes for the NMF model (TF-IDF vectorizer); feeds the
# "Topics Over Time" charts below.
topic_sizes_per_year = get_topic_sizes_per_year(
    model=nmf_model,
    features=nmf_feature_names,
    vectorizer=tfidf_vectorizer
)
topic_sizes_per_year.head()
Out[20]:
topic_labels year value
0 world | peace 1970 0.11
1 assembly | session 1970 0.06
2 countries | developing 1970 0.07
3 rights | human 1970 0.22
4 nuclear | weapons 1970 0.19
In [21]:
# Area chart of NMF topic size per year, one coloured band per topic.
fig = px.area(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
fig.show()
In [22]:
# Same NMF data as a bar chart, one bar segment per topic and year.
fig = px.bar(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
fig.show()
In [23]:
# Same NMF data as one line per topic, which makes individual trends easier to compare.
fig = px.line(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
fig.show()
In [24]:
# Smoothed NMF trends: opacity=0.0 makes the scatter points invisible so that only the
# LOWESS trendline of each topic is drawn.
fig = px.scatter(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    trendline="lowess",
    opacity=0.0,
    title="Topics Over Time",
)
fig.show()

In [25]:
import pyLDAvis.sklearn

# Prepare the interactive pyLDAvis view for the NMF model from the TF-IDF matrix and its
# vectorizer. sort_topics=False presumably keeps the model's own topic order — confirm
# against the pyLDAvis documentation.
# NOTE(review): the variable is named `lda_display` although it holds the NMF visualisation.
lda_display = pyLDAvis.sklearn.prepare(nmf_model, tfidf_vectors, tfidf_vectorizer, sort_topics=False)
# pyLDAvis.display(lda_display)

# Save the visualisation as an HTML file whose name records the configured n-gram range.
file_name = f"docs/models/nmf-n-grams-{n_gram_range[0]}-{n_gram_range[1]}.html"
pyLDAvis.save_html(lda_display, file_name)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning:

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pandas/core/internals/blocks.py:402: RuntimeWarning:

divide by zero encountered in log

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pyLDAvis/_prepare.py:246: FutureWarning:

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pandas/core/internals/blocks.py:402: RuntimeWarning:

divide by zero encountered in log

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  from imp import reload

LDA¶

Neither the book nor the example above uses TF-IDF with LDA, but neither explains why. Both use TF-IDF with NMF and then switch to CountVectorizer for LDA.

https://stackoverflow.com/questions/44781047/necessary-to-apply-tf-idf-to-new-documents-in-gensim-lda-model/44789327#44789327

LDA only needs a bag-of-words vector.

In [26]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA on the raw term counts. The number of topics comes from the configuration cell
# (as for NMF) so that changing `number_of_topics` affects both models.
lda_model = LatentDirichletAllocation(n_components=number_of_topics, random_state=42)
# Only the fitted model is needed here; the document-topic matrix is recomputed later
# with `transform`.
_ = lda_model.fit_transform(count_vectors)
# Vocabulary terms in the column order of `count_vectors` / `lda_model.components_`.
lda_feature_names = count_vectorizer.get_feature_names_out()
In [27]:
# Plot the LDA topics with the project helper from `sklearn_topic_modeling`.
# Presumably shows the 8 top tokens per topic and labels each topic with its 2 top
# tokens joined by ' | ' — confirm against the helper.
plot_topics(
    model=lda_model,
    features=lda_feature_names,
    top_n_tokens=8,
    num_tokens_in_label=2,
    token_separator=' | '
)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

In [28]:
# Plot the relative size of each LDA topic over the sampled paragraphs (project helper),
# computed from the raw count matrix the model was fitted on.
plot_topic_sizes(
    model=lda_model,
    dataset=count_vectors,
    features=lda_feature_names,
    top_n_tokens=3,
    token_separator=' | '
)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

In [29]:
# Score every sampled paragraph against the fitted LDA topics and sum the topic weights
# per document; the box plot shows how those per-document totals are distributed.
predicted_topics = lda_model.transform(count_vectors)
per_document_totals = predicted_topics.sum(axis=1)
ax = pd.Series(per_document_totals).plot.box(vert=False, figsize=(10, 1))
ax.set(
    title="Distribution Sum of Predicted Values/Topics Per Document",
    xlabel="Sum of Predicted Values Per Document",
    yticklabels=[],
);
In [30]:
# Per-year topic sizes for the LDA model (count vectorizer).
# NOTE: this overwrites the NMF result stored under the same name above.
topic_sizes_per_year = get_topic_sizes_per_year(
    model=lda_model,
    features=lda_feature_names,
    vectorizer=count_vectorizer
)
topic_sizes_per_year.head()
Out[30]:
topic_labels year value
0 countries | human 1970 0.10
1 international | peace 1970 0.13
2 peace | people 1970 0.07
3 nuclear | weapons 1970 0.13
4 general | assembly 1970 0.05
In [31]:
# Area chart of LDA topic size per year, one coloured band per topic.
fig = px.area(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

In [32]:
# Same LDA data as a bar chart, one bar segment per topic and year.
fig = px.bar(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

In [33]:
# Same LDA data as one line per topic, which makes individual trends easier to compare.
fig = px.line(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

In [34]:
# Smoothed LDA trends: opacity=0.0 makes the scatter points invisible so that only the
# LOWESS trendline of each topic is drawn.
fig = px.scatter(
    topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    trendline="lowess",
    opacity=0.0,
    title="Topics Over Time",
)
fig.show()
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/plotly/io/_renderers.py:396: DeprecationWarning:

distutils Version classes are deprecated. Use packaging.version instead.


In [35]:
import pyLDAvis.sklearn

# Prepare the interactive pyLDAvis view for the LDA model from the count matrix and its
# vectorizer. sort_topics=False presumably keeps the model's own topic order — confirm
# against the pyLDAvis documentation.
lda_display = pyLDAvis.sklearn.prepare(lda_model, count_vectors, count_vectorizer, sort_topics=False)
# pyLDAvis.display(lda_display)

# Save the visualisation as an HTML file whose name records the configured n-gram range.
file_name = f"docs/models/lda-n-grams-{n_gram_range[0]}-{n_gram_range[1]}.html"
pyLDAvis.save_html(lda_display, file_name)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning:

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pyLDAvis/_prepare.py:246: FutureWarning:

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.